# Load the stopword list: one entry per line, with trailing whitespace
# removed and blank lines skipped.  The ``with`` block guarantees the file
# is closed even if reading or decoding fails part-way through.
with open("d:/data/stopwords.tab", encoding='utf8') as stopwords_file:
    stopwords_set = {
        stripped
        for stripped in (raw.rstrip() for raw in stopwords_file)
        if stripped
    }

import re

# Matches tokens that consist solely of digits; such tokens are dropped.
# Written as a raw string because "\d" inside a plain literal is an invalid
# escape sequence that newer Python versions warn about.
regex = re.compile(r"^\d+$")

# Number of lines processed so far (stays 0 if the input file is empty).
cnt = 0

# Copy the segmented corpus line by line, dropping empty tokens, all-digit
# tokens and stopwords.  Both files are managed by ``with`` so they are
# flushed and closed even if processing raises.  The output is opened first,
# so it is created/truncated before the input is opened.
with open("d:/data/videodata_seg_re.txt", "w", encoding='utf8') as out, \
        open("d:/data/videodata_seg.txt", encoding='utf8') as file:
    # Iterating the file object streams one line at a time instead of
    # buffering ~100 MB chunks via readlines().
    for cnt, line in enumerate(file, 1):
        kept = [
            word
            for word in line.rstrip().split(" ")
            if word and not regex.search(word) and word not in stopwords_set
        ]
        # Every kept token is followed by a single space (including the last
        # one), so the output format is unchanged: a trailing space precedes
        # the newline on non-empty lines.
        out.write("".join(word + " " for word in kept) + "\n")
        if cnt % 10000 == 0:
            # Progress as a fraction of a hard-coded total; presumably the
            # expected number of input lines -- TODO confirm.
            print(cnt / 2560000.0)
